Analyzing SAT Scores



In [17]:

    
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
plt.style.use('fivethirtyeight')
import math
import numpy as np

scores = pd.read_csv('sat_scores.csv')

scores.head(52)



In [5]:

    
scores.Verbal.value_counts()
scores.Math.value_counts()









    Out[5]:





499    6
510    3
515    3
542    3
550    2
501    2
589    2
527    1
474    1
599    1
596    1
514    1
570    1
525    1
526    1
582    1
517    1
516    1
577    1
513    1
539    1
551    1
545    1
572    1
603    1
488    1
489    1
554    1
580    1
561    1
562    1
500    1
553    1
439    1
568    1
505    1
506    1
512    1
Name: Math, dtype: int64

The data lists the rate of participation and mean verbal/math scores of students who took the SAT in 2001 broken down by states.

Initial Analysis:

From my initial observations I look at row 51 and assume that the 45 in the rate column (and 506-Verbal and 514-Math) is the mean of the data in rows 0-50. Iowa and North Dakota, which are in the bottom 5 for rate of participation (at 47th and 48th, respectively), have some of the highest Math and Verbal mean scores. So, rate of participation is not indicative of strong scores. Also, I want to ask the College Board if they want to rank it by participation rate, which is the current method. Or would they prefer ranking the states by highest mean Verbal and Math scores for their presentation this year?



In [159]:

    
scores.describe()



In [9]:

    
#Sorting Verbal scores in ascending order
scores.sort('Verbal')









    



/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  from ipykernel import kernelapp as app






    Out[9]:






  
    
      
      State
      Rate
      Verbal
      Math
    
  
  
    
      16
      DC
      56
      482
      474
    
    
      21
      HI
      52
      485
      515
    
    
      15
      SC
      57
      486
      488
    
    
      13
      GA
      63
      491
      489
    
    
      20
      TX
      53
      493
      499
    
    
      12
      NC
      65
      493
      499
    
    
      3
      NY
      77
      495
      505
    
    
      23
      CA
      51
      498
      517
    
    
      18
      FL
      54
      498
      499
    
    
      1
      NJ
      81
      499
      513
    
    
      14
      IN
      60
      499
      501
    
    
      6
      PA
      71
      500
      499
    
    
      5
      RI
      71
      501
      499
    
    
      10
      DE
      67
      501
      499
    
    
      51
      All
      45
      506
      514
    
    
      8
      ME
      69
      506
      500
    
    
      11
      MD
      65
      508
      510
    
    
      0
      CT
      82
      509
      510
    
    
      25
      NV
      33
      509
      515
    
    
      9
      VA
      68
      510
      501
    
    
      7
      VT
      69
      511
      506
    
    
      2
      MA
      79
      511
      515
    
    
      22
      AK
      51
      514
      510
    
    
      4
      NH
      72
      520
      516
    
    
      24
      AZ
      34
      523
      525
    
    
      17
      OR
      55
      526
      526
    
    
      19
      WA
      53
      527
      527
    
    
      29
      WV
      18
      527
      512
    
    
      27
      OH
      26
      534
      439
    
    
      26
      CO
      31
      539
      542
    
    
      28
      MT
      23
      539
      539
    
    
      30
      ID
      17
      543
      542
    
    
      35
      WY
      11
      547
      545
    
    
      34
      KY
      12
      550
      550
    
    
      32
      NM
      13
      551
      542
    
    
      39
      AL
      9
      559
      554
    
    
      36
      MI
      11
      561
      572
    
    
      31
      TN
      13
      562
      553
    
    
      45
      AR
      6
      562
      550
    
    
      40
      NE
      8
      562
      568
    
    
      43
      LA
      7
      564
      562
    
    
      50
      MS
      4
      566
      551
    
    
      41
      OK
      8
      567
      561
    
    
      46
      UT
      5
      575
      570
    
    
      33
      IL
      12
      576
      589
    
    
      38
      KS
      9
      577
      580
    
    
      48
      SD
      4
      577
      582
    
    
      42
      MO
      8
      577
      577
    
    
      37
      MN
      9
      580
      589
    
    
      44
      WI
      6
      584
      596
    
    
      49
      ND
      4
      592
      599
    
    
      47
      IA
      5
      593
      603

Creating a list of State names extracted from the data.



In [65]:

    
list(scores.State)









    Out[65]:





['CT',
 'NJ',
 'MA',
 'NY',
 'NH',
 'RI',
 'PA',
 'VT',
 'ME',
 'VA',
 'DE',
 'MD',
 'NC',
 'GA',
 'IN',
 'SC',
 'DC',
 'OR',
 'FL',
 'WA',
 'TX',
 'HI',
 'AK',
 'CA',
 'AZ',
 'NV',
 'CO',
 'OH',
 'MT',
 'WV',
 'ID',
 'TN',
 'NM',
 'IL',
 'KY',
 'WY',
 'MI',
 'MN',
 'KS',
 'AL',
 'NE',
 'OK',
 'MO',
 'LA',
 'WI',
 'AR',
 'UT',
 'IA',
 'SD',
 'ND',
 'MS',
 'All']



In [29]:

    
### Checking that types are accurate.
scores.dtypes









    Out[29]:





State     object
Rate       int64
Verbal     int64
Math       int64
dtype: object

Creating a dictionary for each column mapping the State to its respective value for that column.



In [32]:

    
drate = {}
dverbal = {}
dmath = {}

for x in scores.index:
    drate[scores.State.ix[x]] = scores.Rate.ix[x]
    dverbal[scores.State.ix[x]] = scores.Verbal.ix[x]
    dmath[scores.State.ix[x]] = scores.Math.ix[x]

print drate
print ''
print dverbal
print ''
print dmath









    



{'WA': 53, 'DE': 67, 'DC': 56, 'WI': 6, 'WV': 18, 'HI': 52, 'FL': 54, 'WY': 11, 'NH': 72, 'NJ': 81, 'NM': 13, 'TX': 53, 'LA': 7, 'NC': 65, 'ND': 4, 'NE': 8, 'TN': 13, 'NY': 77, 'PA': 71, 'RI': 71, 'NV': 33, 'VA': 68, 'CO': 31, 'AK': 51, 'AL': 9, 'AR': 6, 'VT': 69, 'IL': 12, 'GA': 63, 'IN': 60, 'IA': 5, 'OK': 8, 'AZ': 34, 'CA': 51, 'ID': 17, 'CT': 82, 'ME': 69, 'MD': 65, 'All': 45, 'MA': 79, 'OH': 26, 'UT': 5, 'MO': 8, 'MN': 9, 'MI': 11, 'KS': 9, 'MT': 23, 'MS': 4, 'SC': 57, 'KY': 12, 'OR': 55, 'SD': 4}

{'WA': 527, 'DE': 501, 'DC': 482, 'WI': 584, 'WV': 527, 'HI': 485, 'FL': 498, 'WY': 547, 'NH': 520, 'NJ': 499, 'NM': 551, 'TX': 493, 'LA': 564, 'NC': 493, 'ND': 592, 'NE': 562, 'TN': 562, 'NY': 495, 'PA': 500, 'RI': 501, 'NV': 509, 'VA': 510, 'CO': 539, 'AK': 514, 'AL': 559, 'AR': 562, 'VT': 511, 'IL': 576, 'GA': 491, 'IN': 499, 'IA': 593, 'OK': 567, 'AZ': 523, 'CA': 498, 'ID': 543, 'CT': 509, 'ME': 506, 'MD': 508, 'All': 506, 'MA': 511, 'OH': 534, 'UT': 575, 'MO': 577, 'MN': 580, 'MI': 561, 'KS': 577, 'MT': 539, 'MS': 566, 'SC': 486, 'KY': 550, 'OR': 526, 'SD': 577}

{'WA': 527, 'DE': 499, 'DC': 474, 'WI': 596, 'WV': 512, 'HI': 515, 'FL': 499, 'WY': 545, 'NH': 516, 'NJ': 513, 'NM': 542, 'TX': 499, 'LA': 562, 'NC': 499, 'ND': 599, 'NE': 568, 'TN': 553, 'NY': 505, 'PA': 499, 'RI': 499, 'NV': 515, 'VA': 501, 'CO': 542, 'AK': 510, 'AL': 554, 'AR': 550, 'VT': 506, 'IL': 589, 'GA': 489, 'IN': 501, 'IA': 603, 'OK': 561, 'AZ': 525, 'CA': 517, 'ID': 542, 'CT': 510, 'ME': 500, 'MD': 510, 'All': 514, 'MA': 515, 'OH': 439, 'UT': 570, 'MO': 577, 'MN': 589, 'MI': 572, 'KS': 580, 'MT': 539, 'MS': 551, 'SC': 488, 'KY': 550, 'OR': 526, 'SD': 582}

12. Print the min and max of each column



In [33]:

    
print 'The minimum Rate is' + ' ' + str(min(scores.Rate))
print 'The maximum Rate is' + ' ' + str(max(scores.Rate))
print 'The minimum Verbal score is' + ' ' + str(min(scores.Verbal))
print 'The maximum Verbal score is' + ' ' + str(max(scores.Verbal))
print 'The minimum Math score is' + ' ' + str(min(scores.Math))
print 'The maximum Math score is' + ' ' + str(max(scores.Math))









    



The minimum Rate is 4
The maximum Rate is 82
The minimum Verbal score is 482
The maximum Verbal score is 593
The minimum Math score is 439
The maximum Math score is 603

Writing a function using only list comprehensions, no loops, to compute Standard Deviation.

Printing the Standard Deviation of each numeric column (Rate, Verbal and Math).



In [38]:

    
def std(col):
    std = math.sqrt(sum((scores[col] - np.mean(scores[col])) ** 2) / (len(scores) - 1))
    return std

print('Standard Deviation for Rate of Participaion is ' + str(std('Rate')))
print('Standard Deviation for Average Verbal Score is ' + str(std('Verbal')))
print('Standard Deviation for Average Math Score is ' + str(std('Math')))









    



Standard Deviation for Rate of Participaion is 27.3017880729
Standard Deviation for Average Verbal Score is 33.2362254438
Standard Deviation for Average Math Score is 36.0149750989

Visualizing the data



In [27]:

    
scores.Rate.plot(kind='hist', bins=5, title='Histogram of Rates of Participation')









    Out[27]:





<matplotlib.axes._subplots.AxesSubplot at 0x118de9290>



In [74]:

    
import matplotlib.pyplot as plt
% matplotlib inline
scores.Rate.order().values









    



/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: order is deprecated, use sort_values(...)
  app.launch_new_instance()






    Out[74]:





array([ 4,  4,  4,  5,  5,  6,  6,  7,  8,  8,  8,  9,  9,  9, 11, 11, 12,
       12, 13, 13, 17, 18, 23, 26, 31, 33, 34, 45, 51, 51, 52, 53, 53, 54,
       55, 56, 57, 60, 63, 65, 65, 67, 68, 69, 69, 71, 71, 72, 77, 79, 81,
       82])

15. Plot the Math distribution



In [18]:

    
scores.Math.plot(kind='hist', bins=6, title='Mean SAT Math Scores for 2001')
plt.xlabel('Math Score')
plt.ylabel('Frequency')









    Out[18]:





<matplotlib.text.Text at 0x11525ced0>

16. Plot the Verbal distribution



In [19]:

    
scores.Verbal.plot(kind='hist', bins=7, title='Mean Verbal SAT Scores for 2001')
plt.xlabel('Verbal Score')
plt.ylabel('Frequency')









    Out[19]:





<matplotlib.text.Text at 0x117d7e5d0>



In [20]:

    
#Checking out the data with a density plot
scores.Verbal.plot(kind='density', xlim=(300, 700))









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x117d87b10>

17. What is the typical assumption for data distribution?

Typical assumption for data distribution is a normal distribution

18. Does that distribution hold true for our data?

It does not hold true for the above 3 histograms representing Rate, Verbal and Math. Rate is high on both ends of the graph. Math peaks at the low 500s. While Verbal peaks around 510 and 570.

19. Plot some scatterplots. BONUS: Use a PyPlot `figure` to present multiple plots at once.



In [21]:

    
import matplotlib.pyplot as plt
scores.plot(kind='scatter', x='Verbal', y='Math', alpha=0.5)









    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x1181ac910>



In [22]:

    
scores.plot(kind='scatter', x='Verbal', y='Rate', alpha=0.5)









    Out[22]:





<matplotlib.axes._subplots.AxesSubplot at 0x1184ccdd0>



In [23]:

    
scores.plot(kind='scatter', x='Math', y='Rate', alpha=0.5)









    Out[23]:





<matplotlib.axes._subplots.AxesSubplot at 0x11890c610>

20. Are there any interesting relationships to note?

Both the Verbal and Math mean scores were postively correlated. As one increases so did the other. In the scatterplot for Verbal and Rate chart, the higher scores were in the lowest rate of student participation. The scores were in closer proximity when compared to the scores displayed in the top left of the graph. As rates of participation increased the scores became more spreadout and lower. This held true (for the most part-excluding a couple or outliers) for the scatterplot for Math and Rate. This implies that higher rate of participation does not guarentee higher scores. The more student that take the SATs the more likely it is that the mean scores will decrease.

21. Create box plots for each variable.



In [24]:

    
scores.Rate.plot(kind='box')









    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x1188fed90>



In [25]:

    
scores.Verbal.plot(kind='box')









    Out[25]:





<matplotlib.axes._subplots.AxesSubplot at 0x118bc70d0>



In [26]:

    
scores.Math.plot(kind='box')









    Out[26]:





<matplotlib.axes._subplots.AxesSubplot at 0x118e39ed0>

	Rate	Verbal	Math
count	52.000000	52.000000	52.000000
mean	37.153846	532.019231	531.500000
std	27.301788	33.236225	36.014975
min	4.000000	482.000000	439.000000
25%	9.000000	501.000000	504.000000
50%	33.500000	526.500000	521.000000
75%	63.500000	562.000000	555.750000
max	82.000000	593.000000	603.000000

	State	Rate	Verbal	Math
0	CT	82	509	510
1	NJ	81	499	513
2	MA	79	511	515
3	NY	77	495	505
4	NH	72	520	516
5	RI	71	501	499
6	PA	71	500	499
7	VT	69	511	506
8	ME	69	506	500
9	VA	68	510	501
10	DE	67	501	499
11	MD	65	508	510
12	NC	65	493	499
13	GA	63	491	489
14	IN	60	499	501
15	SC	57	486	488
16	DC	56	482	474
17	OR	55	526	526
18	FL	54	498	499
19	WA	53	527	527
20	TX	53	493	499
21	HI	52	485	515
22	AK	51	514	510
23	CA	51	498	517
24	AZ	34	523	525
25	NV	33	509	515
26	CO	31	539	542
27	OH	26	534	439
28	MT	23	539	539
29	WV	18	527	512
30	ID	17	543	542
31	TN	13	562	553
32	NM	13	551	542
33	IL	12	576	589
34	KY	12	550	550
35	WY	11	547	545
36	MI	11	561	572
37	MN	9	580	589
38	KS	9	577	580
39	AL	9	559	554
40	NE	8	562	568
41	OK	8	567	561
42	MO	8	577	577
43	LA	7	564	562
44	WI	6	584	596
45	AR	6	562	550
46	UT	5	575	570
47	IA	5	593	603
48	SD	4	577	582
49	ND	4	592	599
50	MS	4	566	551
51	All	45	506	514

	State	Rate	Verbal	Math
16	DC	56	482	474
21	HI	52	485	515
15	SC	57	486	488
13	GA	63	491	489
20	TX	53	493	499
12	NC	65	493	499
3	NY	77	495	505
23	CA	51	498	517
18	FL	54	498	499
1	NJ	81	499	513
14	IN	60	499	501
6	PA	71	500	499
5	RI	71	501	499
10	DE	67	501	499
51	All	45	506	514
8	ME	69	506	500
11	MD	65	508	510
0	CT	82	509	510
25	NV	33	509	515
9	VA	68	510	501
7	VT	69	511	506
2	MA	79	511	515
22	AK	51	514	510
4	NH	72	520	516
24	AZ	34	523	525
17	OR	55	526	526
19	WA	53	527	527
29	WV	18	527	512
27	OH	26	534	439
26	CO	31	539	542
28	MT	23	539	539
30	ID	17	543	542
35	WY	11	547	545
34	KY	12	550	550
32	NM	13	551	542
39	AL	9	559	554
36	MI	11	561	572
31	TN	13	562	553
45	AR	6	562	550
40	NE	8	562	568
43	LA	7	564	562
50	MS	4	566	551
41	OK	8	567	561
46	UT	5	575	570
33	IL	12	576	589
38	KS	9	577	580
48	SD	4	577	582
42	MO	8	577	577
37	MN	9	580	589
44	WI	6	584	596
49	ND	4	592	599
47	IA	5	593	603

	State	Rate	Verbal	Math
0	CT	82	509	510
1	NJ	81	499	513
2	MA	79	511	515
3	NY	77	495	505
4	NH	72	520	516
5	RI	71	501	499
6	PA	71	500	499
7	VT	69	511	506
8	ME	69	506	500
9	VA	68	510	501
10	DE	67	501	499
11	MD	65	508	510
12	NC	65	493	499
13	GA	63	491	489
14	IN	60	499	501
15	SC	57	486	488
16	DC	56	482	474
17	OR	55	526	526
18	FL	54	498	499
19	WA	53	527	527
20	TX	53	493	499
21	HI	52	485	515
22	AK	51	514	510
23	CA	51	498	517
24	AZ	34	523	525
25	NV	33	509	515
26	CO	31	539	542
27	OH	26	534	439
28	MT	23	539	539
29	WV	18	527	512
30	ID	17	543	542
31	TN	13	562	553
32	NM	13	551	542
33	IL	12	576	589
34	KY	12	550	550
35	WY	11	547	545
36	MI	11	561	572
37	MN	9	580	589
38	KS	9	577	580
39	AL	9	559	554
40	NE	8	562	568
41	OK	8	567	561
42	MO	8	577	577
43	LA	7	564	562
44	WI	6	584	596
45	AR	6	562	550
46	UT	5	575	570
47	IA	5	593	603
48	SD	4	577	582
49	ND	4	592	599
50	MS	4	566	551
51	All	45	506	514

	State	Rate	Verbal	Math
16	DC	56	482	474
21	HI	52	485	515
15	SC	57	486	488
13	GA	63	491	489
20	TX	53	493	499
12	NC	65	493	499
3	NY	77	495	505
23	CA	51	498	517
18	FL	54	498	499
1	NJ	81	499	513
14	IN	60	499	501
6	PA	71	500	499
5	RI	71	501	499
10	DE	67	501	499
51	All	45	506	514
8	ME	69	506	500
11	MD	65	508	510
0	CT	82	509	510
25	NV	33	509	515
9	VA	68	510	501
7	VT	69	511	506
2	MA	79	511	515
22	AK	51	514	510
4	NH	72	520	516
24	AZ	34	523	525
17	OR	55	526	526
19	WA	53	527	527
29	WV	18	527	512
27	OH	26	534	439
26	CO	31	539	542
28	MT	23	539	539
30	ID	17	543	542
35	WY	11	547	545
34	KY	12	550	550
32	NM	13	551	542
39	AL	9	559	554
36	MI	11	561	572
31	TN	13	562	553
45	AR	6	562	550
40	NE	8	562	568
43	LA	7	564	562
50	MS	4	566	551
41	OK	8	567	561
46	UT	5	575	570
33	IL	12	576	589
38	KS	9	577	580
48	SD	4	577	582
42	MO	8	577	577
37	MN	9	580	589
44	WI	6	584	596
49	ND	4	592	599
47	IA	5	593	603